Clustering weekend

library(tidyverse)
library(cluster)
library(factoextra)
library(dendextend)
library(broom)
library(animation)
customers <- read_csv("mall_customers.csv") %>% 
  janitor::clean_names() %>% 
  rename(sex = gender)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   CustomerID = col_double(),
##   Gender = col_character(),
##   Age = col_double(),
##   `Annual Income (k$)` = col_double(),
##   `Spending Score (1-100)` = col_double()
## )
summary(customers)
##   customer_id         sex                 age        annual_income_k 
##  Min.   :  1.00   Length:200         Min.   :18.00   Min.   : 15.00  
##  1st Qu.: 50.75   Class :character   1st Qu.:28.75   1st Qu.: 41.50  
##  Median :100.50   Mode  :character   Median :36.00   Median : 61.50  
##  Mean   :100.50                      Mean   :38.85   Mean   : 60.56  
##  3rd Qu.:150.25                      3rd Qu.:49.00   3rd Qu.: 78.00  
##  Max.   :200.00                      Max.   :70.00   Max.   :137.00  
##  spending_score_1_100
##  Min.   : 1.00       
##  1st Qu.:34.75       
##  Median :50.00       
##  Mean   :50.20       
##  3rd Qu.:73.00       
##  Max.   :99.00
Age
customers <- customers %>% 
  select(-customer_id)

age_stats <- customers %>% 
  summarise(n = n(),
            mean = mean(age),
            sd = sd(age))

customers %>% 
  ggplot() +
  aes(x = age) +
  geom_histogram(aes(y = ..density..), colour = "white", bins = 25) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = age_stats$mean,
      sd = age_stats$sd
    ),
    colour = "red"
  )

customers %>% 
  select(age) %>% 
  mutate(under_40 = age < 40) %>% 
  ggplot() +
  aes(x = under_40) +
  geom_bar()

  • Age is fairly normally distributed, with more customers under 40

  • A small increase at roughly age 65 = recently retired?

annual_income_k
annual_inc_stats <- customers %>% 
  summarise(n = n(),
            mean = mean(annual_income_k),
            sd = sd(annual_income_k))

customers %>% 
  ggplot() +
  aes(x = annual_income_k) +
  geom_histogram(aes(y = ..density..), colour = "white", bins = 25) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = annual_inc_stats$mean,
      sd = annual_inc_stats$sd
    ),
    colour = "red"
  ) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10))

customers %>% 
  mutate(salary_range = case_when(
    annual_income_k < 50 ~ "A_ under 50k",
    annual_income_k >= 100 ~ "C_ over 100k",
    T ~ "B_ 50 to 100k")) %>% 
  ggplot() +
  aes(x = salary_range) +
  geom_bar()

  • Normally-ish distributed, most common incomes roughly 55k - 75k
spending_score_1_100_stats <- customers %>% 
  summarise(n = n(),
            mean = mean(spending_score_1_100),
            sd = sd(spending_score_1_100))

customers %>% 
  ggplot() +
  aes(x = spending_score_1_100) +
  geom_histogram(aes(y = ..density..), colour = "white", bins = 25) +
  stat_function(
    fun = dnorm,
    args = list(
      mean = spending_score_1_100_stats$mean,
      sd = spending_score_1_100_stats$sd
    ),
    colour = "red"
  ) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10))

  • Normalish distribution - peaks around 50 - then rises a bit at the extremes of the scale
sex
customers %>% 
  ggplot() +
  aes(x = sex) +
  geom_bar()

  • More female customers but fairly similar
summary
  • An average customer may be either sex, aged under 40, earning between 50 and 100k and have a spending score of 50.

Scaling

customers_scaled <- customers %>% 
  select(annual_income_k, spending_score_1_100) %>% 
  mutate(annual_income_k = scale(annual_income_k),
         spending_score_1_100 = scale(spending_score_1_100))

Clustering

Elbow
fviz_nbclust(customers_scaled, kmeans, method = "wss", nstart = 25)

  • Hmm not totally convincing. Maybe 3 or 5?

sillhouette

fviz_nbclust(customers_scaled, kmeans, method = "silhouette", nstart = 25)

* Suggests k = 5

fviz_nbclust(customers_scaled, kmeans, method = "gap_stat", nstart = 25, k.max = 10)

  • This says k = 1

  • Hmm beginning to think the data is not suited to clustering

  • I’ll choose k = 3 from the elbow chart

K = 3

customers_3k <- kmeans(customers_scaled, centers = 3, nstart = 25) 

customers_3k
## K-means clustering with 3 clusters of sizes 38, 123, 39
## 
## Cluster means:
##   annual_income_k spending_score_1_100
## 1       1.0066735          -1.22246770
## 2      -0.6246222          -0.01435636
## 3       0.9891010           1.23640011
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3
## [149] 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3 1
## [186] 3 1 3 1 3 1 3 1 3 1 3 1 3 1 3
## 
## Within cluster sum of squares by cluster:
## [1]  20.81189 116.44835  19.65525
##  (between_SS / total_SS =  60.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
## repeating this using k = 2, produced very different clusters each time
## repeating with k = 3 gives the same/similar clusters each time

customers_scaled %>% 
  kmeans.ani(centers = 3)

customers_with_3k_clusters <- augment(customers_3k, customers)
customers_with_3k_clusters %>% 
  ggplot() +
  aes(x = annual_income_k, y = spending_score_1_100, shape = .cluster, colour = age) +
  geom_point() +
  scale_color_continuous(type = "viridis")

customers_with_3k_clusters %>% 
  ggplot() +
  aes(x = annual_income_k, y = spending_score_1_100, shape = .cluster, colour = sex) +
  geom_point()

  • The clusters seems to be roughly equal male/female

  • Cluster 1 = Higher annual income >= 75k, low spending score < 50, noone over 60

  • Cluster 2 = Lower annual income <= 75, range of spending scores, seems to be 3 smaller clusters. Particualrly dense cluster earning between ~ 40 to 70k, with a spending score of 40 - 60 ish. Where most of the people over 65 years old, and also a lot of young people

  • Cluster 3 = high earners, high spenders, all under 40

  • lets try k = 5

k = 5
customers_5k <- kmeans(customers_scaled, centers = 5, nstart = 25) 

customers_5k
## K-means clustering with 5 clusters of sizes 22, 23, 39, 35, 81
## 
## Cluster means:
##   annual_income_k spending_score_1_100
## 1      -1.3262173           1.12934389
## 2      -1.3042458          -1.13411939
## 3       0.9891010           1.23640011
## 4       1.0523622          -1.28122394
## 5      -0.2004097          -0.02638995
## 
## Clustering vector:
##   [1] 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2
##  [38] 1 2 1 2 1 2 5 2 1 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
##  [75] 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5
## [112] 5 5 5 5 5 5 5 5 5 5 5 5 3 4 3 5 3 4 3 4 3 5 3 4 3 4 3 4 3 4 3 5 3 4 3 4 3
## [149] 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3 4
## [186] 3 4 3 4 3 4 3 4 3 4 3 4 3 4 3
## 
## Within cluster sum of squares by cluster:
## [1]  5.217630  7.577407 19.655252 18.304646 14.485632
##  (between_SS / total_SS =  83.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
# gets same/similar clusters every time

customers_scaled %>% 
  kmeans.ani(centers = 5)

customers_with_5k_clusters <- augment(customers_5k, customers)
customers_with_5k_clusters %>% 
  ggplot() +
  aes(x = annual_income_k, y = spending_score_1_100, shape = .cluster, colour = age) +
  geom_point() +
  scale_color_continuous(type = "viridis")

customers_with_5k_clusters %>% 
  ggplot() +
  aes(x = annual_income_k, y = spending_score_1_100, shape = .cluster, colour = sex) +
  geom_point()

  • Looks like 5 pretty distinct clusters.

  • Cluster 1 = low earner low spenders

  • Cluster 2 = High earner low spender

  • Cluster 3 = High earner high spender

  • Cluster 4 = 50/50

  • Cluster 5 = Low earner High spender

  • I’d say the data seems well suited to clustering where k = 5.